repo: Clean up staging directory for previous boot IDs
authorColin Walters <walters@verbum.org>
Wed, 13 Jan 2016 16:33:54 +0000 (11:33 -0500)
committerColin Walters (automation) <walters+githubbot@verbum.org>
Mon, 2 May 2016 18:44:44 +0000 (18:44 +0000)
We had a policy of cleaning up all files in `$repo/tmp` older
than one day, but we should really clean up previous bootid staging
directories too, as they can potentially take up a lot of disk space.

https://bugzilla.gnome.org/show_bug.cgi?id=760531

Closes: #170
Approved by: jlebon

Makefile-tests.am
src/libostree/ostree-fetcher.c
src/libostree/ostree-repo-commit.c
src/libostree/ostree-repo-private.h
src/libostree/ostree-repo.c
tests/test-admin-deploy-bootid-gc.sh [new file with mode: 0755]

index 9f24ea1df6221cb27e2aa316ced088e81d9abb10..4986b427c20b58616da75e37095594af33e91f31 100644 (file)
@@ -63,6 +63,7 @@ test_scripts = \
        tests/test-admin-deploy-etcmerge-cornercases.sh \
        tests/test-admin-deploy-uboot.sh \
        tests/test-admin-deploy-grub2.sh \
+       tests/test-admin-deploy-bootid-gc.sh \
        tests/test-admin-instutil-set-kargs.sh \
        tests/test-admin-upgrade-not-backwards.sh \
        tests/test-admin-pull-deploy-commit.sh \
index d7915ba69fdfaf5ae47f2ddf93a8d15fb4a9cf27..3606e8fbe5534734da7a3bf03b6e82ac88d1d571 100644 (file)
@@ -383,7 +383,7 @@ session_thread_request_uri (ThreadClosure *thread_closure,
       if (thread_closure->tmpdir_name == NULL)
         {
           if (!_ostree_repo_allocate_tmpdir (thread_closure->base_tmpdir_dfd,
-                                             "fetcher-",
+                                             OSTREE_REPO_TMPDIR_FETCHER,
                                              &thread_closure->tmpdir_name,
                                              &thread_closure->tmpdir_dfd,
                                              &thread_closure->tmpdir_lock,
index 8e00646c83f0f1845ec232ea645dcefcc1897113..0d7f9bee6e36607464c9066ddd772c3760542f7c 100644 (file)
@@ -1161,7 +1161,6 @@ ostree_repo_prepare_transaction (OstreeRepo     *self,
 {
   gboolean ret = FALSE;
   gboolean ret_transaction_resume = FALSE;
-  g_autofree char *stagedir_boot_id_prefix = NULL;
   g_autofree char *stagedir_name = NULL;
   glnx_fd_close int stagedir_fd = -1;
   g_auto(GLnxDirFdIterator) dfd_iter = { 0, };
@@ -1172,10 +1171,8 @@ ostree_repo_prepare_transaction (OstreeRepo     *self,
 
   self->in_transaction = TRUE;
 
-  stagedir_boot_id_prefix = g_strconcat ("staging-", self->boot_id, "-", NULL);
-
   if (!_ostree_repo_allocate_tmpdir (self->tmp_dir_fd,
-                                     stagedir_boot_id_prefix,
+                                     self->stagedir_prefix,
                                      &self->commit_stagedir_name,
                                      &self->commit_stagedir_fd,
                                      &self->commit_stagedir_lock,
@@ -1281,44 +1278,86 @@ cleanup_tmpdir (OstreeRepo        *self,
                 GError           **error)
 {
   gboolean ret = FALSE;
-  g_autoptr(GFileEnumerator) enumerator = NULL;
+  g_auto(GLnxDirFdIterator) dfd_iter = { 0, };
   guint64 curtime_secs;
 
-  enumerator = g_file_enumerate_children (self->tmp_dir, "standard::name,time::modified",
-                                          G_FILE_QUERY_INFO_NOFOLLOW_SYMLINKS,
-                                          cancellable,
-                                          error);
-  if (!enumerator)
-    goto out;
-
   curtime_secs = g_get_real_time () / 1000000;
 
+  if (!glnx_dirfd_iterator_init_at (self->tmp_dir_fd, ".", TRUE, &dfd_iter, error))
+    goto out;
+
   while (TRUE)
     {
-      GFileInfo *file_info;
-      GFile *path;
-      guint64 mtime;
       guint64 delta;
+      struct dirent *dent;
+      struct stat stbuf;
+      g_auto(GLnxLockFile) lockfile = GLNX_LOCK_FILE_INIT;
+      gboolean did_lock;
 
-      if (!gs_file_enumerator_iterate (enumerator, &file_info, &path,
-                                       cancellable, error))
+      if (!glnx_dirfd_iterator_next_dent (&dfd_iter, &dent, cancellable, error))
         goto out;
-      if (file_info == NULL)
+
+      if (dent == NULL)
         break;
 
-      mtime = g_file_info_get_attribute_uint64 (file_info, "time::modified");
-      if (mtime > curtime_secs)
-        continue;
-      /* Only delete files older than a day.  To do better, we would
-       * need to coordinate between multiple processes in a reliable
-       * fashion.  See
-       * https://bugzilla.gnome.org/show_bug.cgi?id=709115
+      if (TEMP_FAILURE_RETRY (fstatat (dfd_iter.fd, dent->d_name, &stbuf, AT_SYMLINK_NOFOLLOW)) < 0)
+        {
+          if (errno == ENOENT) /* Did another cleanup win? */
+            continue;
+          glnx_set_error_from_errno (error);
+          goto out;
+        }
+
+      /* First, if it's a directory which needs locking, but it's
+       * busy, skip it.
        */
-      delta = curtime_secs - mtime;
-      if (delta > 60*60*24)
+      if (_ostree_repo_is_locked_tmpdir (dent->d_name))
         {
-          if (!glnx_shutil_rm_rf_at (AT_FDCWD, gs_file_get_path_cached (path), cancellable, error))
+          if (!_ostree_repo_try_lock_tmpdir (dfd_iter.fd, dent->d_name,
+                                             &lockfile, &did_lock, error))
             goto out;
+          if (!did_lock)
+            continue;
+        }
+
+      /* If however this is the staging directory for the *current*
+       * boot, then don't delete it now - we may end up reusing it, as
+       * is the point.
+       */
+      if (g_str_has_prefix (dent->d_name, self->stagedir_prefix))
+        continue;
+      else if (g_str_has_prefix (dent->d_name, OSTREE_REPO_TMPDIR_STAGING))
+        {
+          /* But, crucially we can now clean up staging directories
+           * from *other* boots
+           */
+          if (!glnx_shutil_rm_rf_at (dfd_iter.fd, dent->d_name, cancellable, error))
+            goto out;
+        }
+      /* FIXME - move OSTREE_REPO_TMPDIR_FETCHER underneath the
+       * staging/boot-id scheme as well, since all of the "did it get
+       * fsync'd" concerns apply to that as well.  Then we can skip
+       * this special case.
+       */
+      else if (g_str_has_prefix (dent->d_name, OSTREE_REPO_TMPDIR_FETCHER))
+        continue;
+      else
+        {
+          /* Now we do time-based cleanup.  Ignore it if it's somehow
+           * in the future...
+           */
+          if (stbuf.st_mtime > curtime_secs)
+            continue;
+
+          /* Now, we arbitrarily delete files/directories older than a
+           * day, since that's what we were doing before we had locking.
+           */
+          delta = curtime_secs - stbuf.st_mtime;
+          if (delta > 60*60*24)
+            {
+              if (!glnx_shutil_rm_rf_at (dfd_iter.fd, dent->d_name, cancellable, error))
+                goto out;
+            }
         }
     }
 
index 70600f4e4d87d31a9fb4de6c9375afc5c62cde24..273edd1ac94e424abcd456fe2c3c936b4ff659b0 100644 (file)
@@ -48,7 +48,7 @@ typedef enum {
 struct OstreeRepo {
   GObject parent;
 
-  char *boot_id;
+  char *stagedir_prefix;
   int commit_stagedir_fd;
   char *commit_stagedir_name;
   GLnxLockFile commit_stagedir_lock;
@@ -108,6 +108,9 @@ typedef struct {
   char checksum[65];
 } OstreeDevIno;
 
+#define OSTREE_REPO_TMPDIR_STAGING "staging-"
+#define OSTREE_REPO_TMPDIR_FETCHER "fetcher-"
+
 gboolean
 _ostree_repo_allocate_tmpdir (int           tmpdir_dfd,
                               const char   *tmpdir_prefix,
@@ -118,6 +121,16 @@ _ostree_repo_allocate_tmpdir (int           tmpdir_dfd,
                               GCancellable *cancellable,
                               GError      **error);
 
+gboolean
+_ostree_repo_is_locked_tmpdir (const char *filename);
+
+gboolean
+_ostree_repo_try_lock_tmpdir (int            tmpdir_dfd,
+                              const char    *tmpdir_name,
+                              GLnxLockFile  *file_lock_out,
+                              gboolean      *out_did_lock,
+                              GError       **error);
+
 gboolean
 _ostree_repo_ensure_loose_objdir_at (int             dfd,
                                      const char     *loose_path,
index a34e2ec163e9df0308df1187c7dd8922d8b67283..2c9a7fd32834e757f8291e99f69f219c54c1fede 100644 (file)
@@ -617,7 +617,7 @@ ostree_repo_finalize (GObject *object)
 
   g_clear_object (&self->parent_repo);
 
-  g_free (self->boot_id);
+  g_free (self->stagedir_prefix);
   g_clear_object (&self->repodir);
   if (self->repo_dir_fd != -1)
     (void) close (self->repo_dir_fd);
@@ -2428,22 +2428,26 @@ ostree_repo_open (OstreeRepo    *self,
   if (self->inited)
     return TRUE;
 
-  /* We use a per-boot identifier to keep track of which file contents
-   * possibly haven't been sync'd to disk.
+  /* We use a directory of the form `staging-${BOOT_ID}-${RANDOM}`
+   * where if the ${BOOT_ID} doesn't match, we know file contents
+   * possibly haven't been sync'd to disk and need to be discarded.
    */
   { const char *env_bootid = getenv ("OSTREE_BOOTID");
+    g_autofree char *boot_id = NULL;
 
     if (env_bootid != NULL)
-      self->boot_id = g_strdup (env_bootid);
+      boot_id = g_strdup (env_bootid);
     else
       {
         if (!g_file_get_contents ("/proc/sys/kernel/random/boot_id",
-                                  &self->boot_id,
+                                  &boot_id,
                                   NULL,
                                   error))
           goto out;
-        g_strdelimit (self->boot_id, "\n", '\0');
+        g_strdelimit (boot_id, "\n", '\0');
       }
+
+    self->stagedir_prefix = g_strconcat (OSTREE_REPO_TMPDIR_STAGING, boot_id, "-", NULL);
   }
 
   if (!glnx_opendirat (AT_FDCWD, gs_file_get_path_cached (self->repodir), TRUE,
@@ -5003,6 +5007,50 @@ ostree_repo_regenerate_summary (OstreeRepo     *self,
   return ret;
 }
 
+gboolean
+_ostree_repo_is_locked_tmpdir (const char *filename)
+{
+  return g_str_has_prefix (filename, OSTREE_REPO_TMPDIR_STAGING) ||
+    g_str_has_prefix (filename, OSTREE_REPO_TMPDIR_FETCHER);
+}
+
+gboolean
+_ostree_repo_try_lock_tmpdir (int            tmpdir_dfd,
+                              const char    *tmpdir_name,
+                              GLnxLockFile  *file_lock_out,
+                              gboolean      *out_did_lock,
+                              GError       **error)
+{
+  gboolean ret = FALSE;
+  g_autofree char *lock_name = g_strconcat (tmpdir_name, "-lock", NULL);
+  gboolean did_lock = FALSE;
+  g_autoptr(GError) local_error = NULL;
+
+  /* We put the lock outside the dir, so we can hold the lock
+   * until the directory is fully removed */
+  if (!glnx_make_lock_file (tmpdir_dfd, lock_name, LOCK_EX | LOCK_NB,
+                            file_lock_out, &local_error))
+    {
+      if (g_error_matches (local_error, G_IO_ERROR, G_IO_ERROR_WOULD_BLOCK))
+        {
+          did_lock = FALSE;
+        }
+      else
+        {
+          g_propagate_error (error, g_steal_pointer (&local_error));
+          goto out;
+        }
+    }
+  else
+    {
+      did_lock = TRUE;
+    }
+
+  ret = TRUE;
+  *out_did_lock = did_lock;
+ out:
+  return ret;
+}
 
 /* This allocates and locks a subdir of the repo tmp dir, using an existing
  * one with the same prefix if it is not in use already. */
@@ -5016,14 +5064,18 @@ _ostree_repo_allocate_tmpdir (int tmpdir_dfd,
                               GCancellable *cancellable,
                               GError **error)
 {
+  gboolean ret = FALSE;
   gboolean reusing_dir = FALSE;
+  gboolean did_lock;
   g_autofree char *tmpdir_name = NULL;
   glnx_fd_close int tmpdir_fd = -1;
   g_auto(GLnxDirFdIterator) dfd_iter = { 0, };
 
+  g_return_val_if_fail (_ostree_repo_is_locked_tmpdir (tmpdir_prefix), FALSE);
+
   /* Look for existing tmpdir (with same prefix) to reuse */
   if (!glnx_dirfd_iterator_init_at (tmpdir_dfd, ".", FALSE, &dfd_iter, error))
-    return FALSE;
+    goto out;
 
   while (tmpdir_name == NULL)
     {
@@ -5031,10 +5083,9 @@ _ostree_repo_allocate_tmpdir (int tmpdir_dfd,
       struct dirent *dent;
       glnx_fd_close int existing_tmpdir_fd = -1;
       g_autoptr(GError) local_error = NULL;
-      g_autofree char *lock_name = NULL;
 
       if (!glnx_dirfd_iterator_next_dent (&dfd_iter, &dent, cancellable, error))
-        return FALSE;
+        goto out;
 
       if (dent == NULL)
         break;
@@ -5055,25 +5106,18 @@ _ostree_repo_allocate_tmpdir (int tmpdir_dfd,
           else
             {
               g_propagate_error (error, g_steal_pointer (&local_error));
-              return FALSE;
+              goto out;
             }
         }
 
-      lock_name = g_strconcat (dent->d_name, "-lock", NULL);
-
       /* We put the lock outside the dir, so we can hold the lock
        * until the directory is fully removed */
-      if (!glnx_make_lock_file (dfd_iter.fd, lock_name, LOCK_EX | LOCK_NB,
-                                file_lock_out, &local_error))
-        {
-          if (g_error_matches (local_error, G_IO_ERROR, G_IO_ERROR_WOULD_BLOCK))
-            continue;
-          else
-            {
-              g_propagate_error (error, g_steal_pointer (&local_error));
-              return FALSE;
-            }
-        }
+      if (!_ostree_repo_try_lock_tmpdir (dfd_iter.fd, dent->d_name,
+                                         file_lock_out, &did_lock,
+                                         error))
+        goto out;
+      if (!did_lock)
+        continue;
 
       /* Touch the reused directory so that we don't accidentally
        *   remove it due to being old when cleaning up the tmpdir
@@ -5091,32 +5135,24 @@ _ostree_repo_allocate_tmpdir (int tmpdir_dfd,
       g_autofree char *tmpdir_name_template = g_strconcat (tmpdir_prefix, "XXXXXX", NULL);
       glnx_fd_close int new_tmpdir_fd = -1;
       g_autoptr(GError) local_error = NULL;
-      g_autofree char *lock_name = NULL;
 
       /* No existing tmpdir found, create a new */
 
       if (!glnx_mkdtempat (tmpdir_dfd, tmpdir_name_template, 0777, error))
-        return FALSE;
+        goto out;
 
       if (!glnx_opendirat (tmpdir_dfd, tmpdir_name_template, FALSE,
                            &new_tmpdir_fd, error))
-        return FALSE;
-
-      lock_name = g_strconcat (tmpdir_name_template, "-lock", NULL);
+        goto out;
 
       /* Note, at this point we can race with another process that picks up this
        * new directory. If that happens we need to retry, making a new directory. */
-      if (!glnx_make_lock_file (tmpdir_dfd, lock_name, LOCK_EX | LOCK_NB,
-                                file_lock_out, &local_error))
-        {
-          if (g_error_matches (local_error, G_IO_ERROR, G_IO_ERROR_WOULD_BLOCK))
-            continue;
-          else
-            {
-              g_propagate_error (error, g_steal_pointer (&local_error));
-              return FALSE;
-            }
-        }
+      if (!_ostree_repo_try_lock_tmpdir (tmpdir_dfd, tmpdir_name_template,
+                                         file_lock_out, &did_lock,
+                                         error))
+        goto out;
+      if (!did_lock)
+        continue;
 
       tmpdir_name = g_steal_pointer (&tmpdir_name_template);
       tmpdir_fd = glnx_steal_fd (&new_tmpdir_fd);
@@ -5131,5 +5167,7 @@ _ostree_repo_allocate_tmpdir (int tmpdir_dfd,
   if (reusing_dir_out)
     *reusing_dir_out = reusing_dir;
 
-  return TRUE;
+  ret = TRUE;
+ out:
+  return ret;
 }
diff --git a/tests/test-admin-deploy-bootid-gc.sh b/tests/test-admin-deploy-bootid-gc.sh
new file mode 100755 (executable)
index 0000000..ba16f33
--- /dev/null
@@ -0,0 +1,58 @@
+#!/bin/bash
+#
+# Copyright (C) 2016 Colin Walters <walters@verbum.org>
+#
+# This library is free software; you can redistribute it and/or
+# modify it under the terms of the GNU Lesser General Public
+# License as published by the Free Software Foundation; either
+# version 2 of the License, or (at your option) any later version.
+#
+# This library is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+# Lesser General Public License for more details.
+#
+# You should have received a copy of the GNU Lesser General Public
+# License along with this library; if not, write to the
+# Free Software Foundation, Inc., 59 Temple Place - Suite 330,
+# Boston, MA 02111-1307, USA.
+
+set -euo pipefail
+
+. $(dirname $0)/libtest.sh
+
+# Exports OSTREE_SYSROOT so --sysroot not needed.
+setup_os_repository "archive-z2" "syslinux"
+
+echo "1..1"
+
+${CMD_PREFIX} ostree --repo=sysroot/ostree/repo pull-local --remote=testos testos-repo testos/buildmaster/x86_64-runtime
+rev=$(${CMD_PREFIX} ostree --repo=sysroot/ostree/repo rev-parse testos/buildmaster/x86_64-runtime)
+export rev
+${CMD_PREFIX} ostree admin deploy --karg=root=LABEL=MOO --karg=quiet --os=testos testos:testos/buildmaster/x86_64-runtime
+os_repository_new_commit
+
+rm sysroot/ostree/repo/tmp/* -rf
+export TEST_BOOTID=4072029c-8b10-60d1-d31b-8422eeff9b42
+if env OSTREE_REPO_TEST_ERROR=pre-commit OSTREE_BOOTID=${TEST_BOOTID} \
+       ${CMD_PREFIX} ostree admin deploy --karg=root=LABEL=MOO --karg=quiet --os=testos testos:testos/buildmaster/x86_64-runtime 2>err.txt; then
+    assert_not_reached "Should have hit OSTREE_REPO_TEST_ERROR_PRE_COMMIT"
+fi
+stagepath=$(ls -d sysroot/ostree/repo/tmp/staging-${TEST_BOOTID}-*)
+assert_has_dir "${stagepath}"
+
+# We have an older failed stage, now use a new boot id
+
+export NEW_TEST_BOOTID=5072029c-8b10-60d1-d31b-8422eeff9b42
+if env OSTREE_REPO_TEST_ERROR=pre-commit OSTREE_BOOTID=${NEW_TEST_BOOTID} \
+       ${CMD_PREFIX} ostree admin deploy --karg=root=LABEL=FOO --karg=quiet --os=testos testos:testos/buildmaster/x86_64-runtime 2>err.txt; then
+    assert_not_reached "Should have hit OSTREE_REPO_TEST_ERROR_PRE_COMMIT"
+fi
+newstagepath=$(ls -d sysroot/ostree/repo/tmp/staging-${NEW_TEST_BOOTID}-*)
+assert_has_dir "${newstagepath}"
+env OSTREE_BOOTID=${NEW_TEST_BOOTID} ${CMD_PREFIX} ostree admin deploy --karg=root=LABEL=MOO --karg=quiet --os=testos testos:testos/buildmaster/x86_64-runtime
+newstagepath=$(ls -d sysroot/ostree/repo/tmp/staging-${NEW_TEST_BOOTID}-*)
+assert_not_has_dir "${stagepath}"
+assert_not_has_dir "${newstagepath}"
+
+echo "ok admin bootid GC"